# Load the three Friends datasets from CSV files (paths are relative to the
# working directory). read.csv() already defaults to comma-separated input
# with a header row, so no extra arguments are needed.

friends <- read.csv("friends.csv")
friends_info <- read.csv("friends_info.csv")
friends_emotions <- read.csv("friends_emotions.csv")
library(tidyverse)

What is the distribution of lines (measured by the frequency of the variable “text”) among the six main characters throughout the 10 seasons? Does any non-main character have more lines than a main character in any episode?

# The six lead characters, spelled as they are matched against `speaker`
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# One row per lead character holding the number of rows (lines) attributed to
# that speaker, ordered from most to fewest lines
line_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  tally(name = "total_lines") |>
  arrange(desc(total_lines))

# Show the per-character totals
cat("Distribution of lines spoken by main characters:\n")
## Distribution of lines spoken by main characters:
print(line_distribution)
## # A tibble: 6 × 2
##   speaker        total_lines
##   <chr>                <int>
## 1 Rachel Green          9312
## 2 Ross Geller           9157
## 3 Chandler Bing         8465
## 4 Monica Geller         8441
## 5 Joey Tribbiani        8171
## 6 Phoebe Buffay         7501
# Bar chart of total lines per main character; bars are ordered by line count
ggplot(
  line_distribution,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  geom_col(fill = "skyblue") +  # heights are taken directly from total_lines
  labs(
    title = "Distribution of Lines Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  theme_minimal() +
  # Tilt the character names so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# The six lead characters (same values as the earlier definition, repeated
# here before the utterance summary)
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# Per-character sum of the `utterance` column (missing values ignored),
# ordered from the largest to the smallest total.
# NOTE(review): this adds up the values stored in `utterance` rather than
# counting rows; if `utterance` is a running index within a scene, the sum is
# not a count of utterances -- confirm the meaning of the column.
utterance_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  summarise(
    total_utterance = sum(utterance, na.rm = TRUE)
  ) |>
  arrange(desc(total_utterance))

# Show the per-character totals
cat("Distribution of utterances spoken by main characters:\n")
## Distribution of utterances spoken by main characters:
print(utterance_distribution)
## # A tibble: 6 × 2
##   speaker        total_utterance
##   <chr>                    <int>
## 1 Rachel Green            187427
## 2 Ross Geller             182134
## 3 Monica Geller           157199
## 4 Chandler Bing           154488
## 5 Joey Tribbiani          151005
## 6 Phoebe Buffay           132168
# Bar chart of the summed `utterance` values per main character; bars are
# ordered by that total
ggplot(
  utterance_distribution,
  aes(x = reorder(speaker, total_utterance), y = total_utterance)
) +
  geom_col(fill = "lightgreen") +  # heights come straight from the data
  labs(
    title = "Distribution of Utterances Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Utterances"
  ) +
  theme_minimal() +
  # Tilt the character names so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Speakers outside the main cast. Rows with a missing speaker and the two
# labels that are not individual characters ("#ALL#", "Scene Directions") are
# removed by hand; only speakers with more than 150 lines are kept, ordered
# from most to fewest lines.
non_main_characters <- friends |>
  filter(
    !is.na(speaker),
    !speaker %in% c(main_characters, "#ALL#", "Scene Directions")
  ) |>
  group_by(speaker) |>
  summarise(
    total_lines = n(),
    total_utterance = sum(utterance, na.rm = TRUE)
  ) |>
  filter(total_lines > 150) |>
  arrange(desc(total_lines))

# Bar chart of total lines for the retained non-main characters; bars are
# ordered by line count (only `total_lines` is plotted here)
ggplot(
  non_main_characters,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  geom_col(fill = "lightcoral") +
  labs(
    title = "Non-Main Characters with most lines in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  theme_minimal() +
  # Tilt the character names so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Lines per season for three selected recurring speakers (per the original
# analysis, the non-main characters with the most lines). complete() adds a
# zero-count row for every season 1-10 in which a speaker has no lines, so
# each speaker is represented in all ten seasons.
speaker_distribution <- friends |>
  filter(
    speaker %in% c("Janice Litman Goralnik", "Mike Hannigan", "Richard Burke")
  ) |>
  group_by(speaker, season) |>
  summarise(Number_of_Lines = n(), .groups = "drop") |>
  complete(season = 1:10, speaker, fill = list(Number_of_Lines = 0))

# Side-by-side bars per season, one bar per selected speaker
ggplot(
  speaker_distribution,
  aes(x = factor(season), y = Number_of_Lines, fill = speaker)
) +
  geom_col(position = "dodge", color = "black") +
  # One fill colour per speaker
  scale_fill_manual(values = c("skyblue", "orange", "lightgreen")) +
  labs(
    title = "Line Distribution for Selected Characters Across Seasons",
    x = "Season",
    y = "Number of Lines",
    fill = "Speaker"
  ) +
  theme_minimal() +
  # Centre and embolden the title; set axis and legend text sizes
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

# Lines per speaker in every episode (main and non-main characters alike).
# Rows without a speaker and the two labels that are not individual
# characters ("#ALL#" and "Scene Directions") are left out, matching the
# exclusions used when ranking non-main characters earlier in this script.
line_distribution <- friends |>
  filter(
    !is.na(speaker),
    !speaker %in% c("#ALL#", "Scene Directions")
  ) |>
  group_by(season, episode, speaker) |>
  summarise(total_lines = n(), .groups = "drop")

# For each episode keep the speaker(s) with the highest line count. Ties are
# kept, so one episode can contribute more than one row. The result is
# ungrouped so that later steps operate on the whole table.
most_lines_per_episode <- line_distribution |>
  group_by(season, episode) |>
  filter(total_lines == max(total_lines)) |>
  ungroup()

# Output the result
print(most_lines_per_episode)
## # A tibble: 244 × 4
## # Groups:   season, episode [236]
##    season episode speaker       total_lines
##     <int>   <int> <chr>               <int>
##  1      1       1 Monica Geller          73
##  2      1       2 Ross Geller            68
##  3      1       3 Monica Geller          52
##  4      1       4 Monica Geller          47
##  5      1       5 Ross Geller            40
##  6      1       6 Chandler Bing          58
##  7      1       7 Ross Geller            53
##  8      1       8 Ross Geller            61
##  9      1       9 Monica Geller          48
## 10      1      10 Phoebe Buffay          51
## # ℹ 234 more rows
# Keep only the episode leaders who are not part of the main cast
non_main_results <- most_lines_per_episode |>
  filter(!speaker %in% main_characters)

# Report whether any such episode exists and, if so, list them
if (nrow(non_main_results) == 0) {
  print("No episodes where a non-main character has the most lines.")
} else {
  print("Episodes where a non-main character has the most lines:")
  print(non_main_results)
}
## [1] "Episodes where a non-main character has the most lines:"
## # A tibble: 2 × 4
## # Groups:   season, episode [2]
##   season episode speaker      total_lines
##    <int>   <int> <chr>              <int>
## 1      6      21 Paul Stevens          44
## 2      9       8 Amy Green             58

How many times did Joey say his famous line, “How you doin?” What is the distribution of the frequency of this line?

# All rows spoken by Joey (rows with a missing speaker are dropped)
joey_lines <- friends[which(friends$speaker == "Joey Tribbiani"), ]

# Number of Joey's lines whose text matches the pattern, ignoring case.
# NOTE(review): the pattern is treated as a regular expression, so the
# trailing `?` makes the final "n" optional instead of matching a literal
# question mark; lines containing e.g. "how you doing" are counted as well --
# confirm this is the intended definition of the catchphrase.
how_you_doin_count <- length(
  grep("How you doin?", joey_lines$text, ignore.case = TRUE)
)

# Report the total
cat("Joey says 'How you doin?'", how_you_doin_count, "times.\n")
## Joey says 'How you doin?' 25 times.
# Matching lines per season: flag each of Joey's lines that matches the
# pattern (case-insensitive), sum the flags within each season, and name the
# two resulting columns `season` and `count`.
# NOTE(review): as a regular expression the trailing `?` makes the final "n"
# optional, so "how you doing" also matches -- confirm this is intended.
how_you_doin_by_season <- setNames(
  aggregate(
    grepl("How you doin?", text, ignore.case = TRUE) ~ season,
    data = joey_lines,
    FUN = sum
  ),
  c("season", "count")
)

# Load ggplot2 library for plotting
library(ggplot2)

# Line plot of the per-season match counts, with a point at every season
ggplot(how_you_doin_by_season, aes(x = season, y = count)) +
  # `linewidth` sets line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0 and triggers a warning
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +        # Points at each data value
  labs(
    title = "Number of Times Joey Says 'How you doin?' by Season",
    x = "Season",
    y = "Count"
  ) +
  scale_x_continuous(breaks = 0:10, limits = c(0, 10)) +  # X-axis from 0 to 10
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

How many times did the six main characters say “I love you”? What is the distribution of the frequency of this line?

# Number of lines per main character whose text contains "I love you"
# (case-sensitive match; each matching line counts once), with the most
# frequent speaker first
love_you_count <- friends |>
  filter(speaker %in% main_characters, str_detect(text, "I love you")) |>
  group_by(speaker) |>
  tally(name = "total_count") |>
  arrange(desc(total_count))

# Show the totals for each character
cat("Occurrences of 'I love you' by each main character:\n")
## Occurrences of 'I love you' by each main character:
print(love_you_count)
## # A tibble: 6 × 2
##   speaker        total_count
##   <chr>                <int>
## 1 Monica Geller           45
## 2 Chandler Bing           33
## 3 Ross Geller             31
## 4 Rachel Green            25
## 5 Phoebe Buffay           19
## 6 Joey Tribbiani          11
# Character(s) with the highest "I love you" count. Ties are kept, so
# `most_love` can hold more than one row.
most_love <- love_you_count |>
  slice_max(total_count, n = 1, with_ties = TRUE)

# Report the result. Tied speakers are joined into one comma-separated list
# and the shared count is printed once, so the sentence stays readable when
# several characters tie.
cat(
  "The character who said 'I love you' the most is:",
  paste(most_love$speaker, collapse = ", "),
  "with",
  unique(most_love$total_count),
  "occurrences.\n"
)
## The character who said 'I love you' the most is: Monica Geller with 45 occurrences.
# Number of lines containing "I love you" for each main character in each
# season. complete() adds an explicit zero for every season 1-10 in which a
# character has no matching line; without those rows a line plot of this table
# would connect the neighbouring seasons and hide the zero.
love_you_by_season <- friends |>
  filter(speaker %in% main_characters & str_detect(text, "I love you")) |>
  group_by(season, speaker) |>
  summarise(total_count = n(), .groups = "drop") |>
  complete(season = 1:10, speaker, fill = list(total_count = 0L)) |>
  arrange(season)


# One line per main character showing the per-season "I love you" counts
ggplot(love_you_by_season, aes(x = season, y = total_count, color = speaker)) +
  # `linewidth` sets line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0 and triggers a warning
  geom_line(linewidth = 1) +
  geom_point(size = 3) +                                 # Add points to the lines
  theme_minimal() +
  labs(
    title = "Occurrences of 'I love you' by six main charactors Through the Seasons",
    x = "Season",
    y = "Total Occurrences"
  ) +
  scale_x_continuous(breaks = 1:10, limits = c(1, 10)) +  # Set x-axis from 1 to 10 (seasons)
  # Integer tick marks from 0 up to the largest observed count
  scale_y_continuous(breaks = seq(0, max(love_you_by_season$total_count), by = 1)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

Fit two linear models: one with us_views_millions as x and imdb_rating as y, the other with imdb_rating as x and us_views_millions as y. Compare them and decide which model makes more sense in reality.

# Model 1: imdb_rating (y) regressed on us_views_millions (x)
fit <- lm(imdb_rating ~ us_views_millions, data = friends_info)
# Model 2: the reverse direction, us_views_millions (y) on imdb_rating (x)
fit.1 <- lm(us_views_millions ~ imdb_rating, data = friends_info)

# Scatter plots of the raw data in each orientation
plot(friends_info$us_views_millions, friends_info$imdb_rating)
plot(friends_info$imdb_rating, friends_info$us_views_millions)

# The model with us_views_millions as x and imdb_rating as y makes more sense,
# as the x values were collected 10 years before the y values.

# Diagnostic plots and coefficient summary for that model
plot(fit)

summary(fit)
## 
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15016 -0.24770 -0.02827  0.22250  1.17155 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.731888   0.119473  64.717  < 2e-16 ***
## us_views_millions 0.028757   0.004613   6.234  2.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3699 on 234 degrees of freedom
## Multiple R-squared:  0.1424, Adjusted R-squared:  0.1388 
## F-statistic: 38.87 on 1 and 234 DF,  p-value: 2.097e-09

# Refit the model without the four suspected outliers (rows 36, 37, 235 and
# 236) to see whether they change the fit noticeably. The rows are dropped
# from a copy, so `friends_info` itself keeps every episode and does not have
# to be restored afterwards.
friends_info_no_outliers <- friends_info |>
  slice(-c(36, 37, 235, 236))
fit <- lm(imdb_rating ~ us_views_millions, data = friends_info_no_outliers)

# Diagnostic plots and coefficient summary for the refitted model
plot(fit)

summary(fit)
## 
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14531 -0.24731 -0.02954  0.21320  1.16609 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.691313   0.158578  48.502  < 2e-16 ***
## us_views_millions 0.030419   0.006295   4.832 2.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3669 on 230 degrees of freedom
## Multiple R-squared:  0.09217,    Adjusted R-squared:  0.08822 
## F-statistic: 23.35 on 1 and 230 DF,  p-value: 2.467e-06

The outliers don’t change the model much. Keep the outliers.

# Reload the full episode table from disk so that the model below is fitted
# on every episode, regardless of any rows dropped during the outlier check.
# `friends` and `friends_emotions` are not modified anywhere above, so they
# do not need to be read again.
friends_info <- read.csv("friends_info.csv", sep = ",", header = TRUE)
fit <- lm(imdb_rating ~ us_views_millions, data = friends_info)

# Diagnostic plots and coefficient summary for the full-data model
plot(fit)

summary(fit)
## 
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15016 -0.24770 -0.02827  0.22250  1.17155 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       7.731888   0.119473  64.717  < 2e-16 ***
## us_views_millions 0.028757   0.004613   6.234  2.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3699 on 234 degrees of freedom
## Multiple R-squared:  0.1424, Adjusted R-squared:  0.1388 
## F-statistic: 38.87 on 1 and 234 DF,  p-value: 2.097e-09

Use the Box-Cox method to explore transformations of y.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Box-Cox profile plot for the response of `fit` (MASS package)
MASS::boxcox(fit)

The Box-Cox method indicates that log(y), reciprocal(y), sqrt(y), and no transformation are all candidate choices.

# Candidate response transformations: reciprocal, natural log and square root
fit.reciprocal <- lm(1 / imdb_rating ~ us_views_millions, data = friends_info)
fit.log <- lm(log(imdb_rating) ~ us_views_millions, data = friends_info)
fit.sqrt <- lm(sqrt(imdb_rating) ~ us_views_millions, data = friends_info)

# Draw the diagnostic plots of each transformed model, in the order above, to
# check whether the residuals improve
invisible(lapply(list(fit.reciprocal, fit.log, fit.sqrt), plot))

No obvious improvement. Don’t apply transformations.